"""
Code to run classification models on ATI Mexico Request data
"""
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from skmultilearn.problem_transform import ClassifierChain, BinaryRelevance
from copy import deepcopy
from auxiliar_variables import models_dict, opt_thresholds
from math import ceil
from skmultilearn.ensemble import RakelD
import random
import sys
import re
import string
from skmultilearn.adapt import MLkNN
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.layers import Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.random import set_seed
import tensorflow as tf


# Characters that become stand-alone tokens: ASCII punctuation plus a set of
# typographic symbols. re.escape keeps characters such as ']', '\' and '^'
# literal inside the character class instead of relying on the order in which
# they happen to appear in string.punctuation.
re_tok = re.compile(
    f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """
    Splits a text into tokens, isolating punctuation marks

    Every character matched by re_tok is surrounded by spaces so that it
    becomes a token of its own; the text is then split on whitespace.

    Inputs
        s : text to tokenize

    Returns
        list of tokens (empty for empty or whitespace-only text)
    """
    return re_tok.sub(r' \1 ', s).split()


def run_ECC(
        X, y_all, label_cols,
        function_name, seeds=list(range(2)), split_size=0.2):
    """
    Runs Ensemble Classifier Chain and saves results in file

    For every seed the data is split and TF-IDF vectorized once. Five
    classifier chains, each trained on a different label order, are fitted
    on that split and their predicted probabilities are averaged.

    Inputs
        X : cleaned text of requests (TF-IDF vectorized inside)
        y_all : table of target labels (train and test)
        label_cols : list of labels' names
        function_name : name used in the log and for the results file
        seeds : seeds for the train/test split, one run per seed (optional)
        split_size : percentage of test data (optional, default = 0.2)

    """
    results = []
    n_chains = 5
    labels = y_all[label_cols]
    for i, seed in enumerate(seeds):
        print(f'Iteration {i+1} of {len(seeds)}')
        print(f'Running {function_name}')

        # The split depends only on the number of rows and the seed, so it
        # is done once here; each chain just reorders the label columns.
        X_train, X_test, y_train, y_test = train_test_split(
            X, labels.values, test_size=split_size, random_state=seed)

        vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),
            tokenizer=tokenize,
            encoding='utf-8',
            use_idf=True,
            smooth_idf=True,
            max_features=None,
            norm='l2',
            max_df=0.5,
            min_df=0.01, strip_accents='unicode',
            sublinear_tf=1
        )
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)

        outputs_list = []
        for k in range(n_chains):
            # fixed per-chain seed: chain k always uses the same label order
            random.seed(k*4)
            np.random.seed(k*4)
            permute = np.random.permutation(len(label_cols))
            # inverse permutation, to put the outputs back in label_cols order
            reorder = np.argsort(permute)

            model = ClassifierChain(LogisticRegression(
                C=1.75, solver='lbfgs', max_iter=500))
            model.fit(X_train, y_train[:, permute])
            pred_probs = model.predict_proba(X_test).A

            outputs_list.append(pred_probs[:, reorder])

        # ensemble = mean probability over the chains, thresholded at 0.5
        ens_out = np.mean(outputs_list, axis=0)
        labels_out = 1 * (ens_out > 0.5)

        all_acc = {}
        all_acc['acc'] = 100*metrics.accuracy_score(y_test, labels_out)
        all_acc['micro'] = 100*metrics.f1_score(
            y_test, labels_out, average='micro')
        all_acc['macro'] = 100*metrics.f1_score(
            y_test, labels_out, average='macro')
        all_acc['hamming'] = 100*metrics.hamming_loss(y_test, labels_out)
        all_acc['r_loss'] = metrics.label_ranking_loss(y_test, ens_out)
        results.append(all_acc)

    df_final = pd.DataFrame(results)
    print(f'Metrics : \n{df_final.mean(axis=0)}')
    df_final.to_csv(f'../results/ATI/{function_name}.csv')


def run_BR_opt_models(
        X, y_all, models_dict, label_cols,
        function_name, seeds=list(range(2)), split_size=0.2):
    """
    Runs Binary Relevance models and saves results in file

    One independent binary model per label is trained on the same
    train/test split. When function_name is 'SMOTE' the training data of
    every label is oversampled with SMOTE before fitting.

    Inputs
        X : cleaned text of requests (TF-IDF vectorized inside)
        y_all : table of target labels (train and test)
        models_dict : dictionary with each model and its hyperparemeters,
                      keyed by label name
        label_cols : list of labels' names
        function_name : name used in the log and for the results file;
                        'SMOTE' additionally enables oversampling
        seeds : seeds for the train/test split, one run per seed (optional)
        split_size : percentage of test data (optional, default = 0.2)

    """
    results = []
    label_frame = y_all[label_cols]
    for i, seed in enumerate(seeds):
        print(f'Iteration {i+1} of {len(seeds)}')

        # The split and the TF-IDF features are identical for every label,
        # so they are computed once per seed instead of once per label.
        X_train, X_test, y_train, y_test = train_test_split(
            X, label_frame, test_size=split_size, random_state=seed)
        y_test = y_test.values

        vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),
            tokenizer=tokenize,
            encoding='utf-8',
            use_idf=True,
            smooth_idf=True,
            max_features=None,
            norm='l2',
            max_df=0.5,
            min_df=0.01, strip_accents='unicode',
            sublinear_tf=1
        )
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)

        # sized from the actual split rather than a recomputed estimate
        pred_all = np.empty((y_test.shape[0], len(label_cols)))
        prob_all = np.empty((y_test.shape[0], len(label_cols)))

        for k, label in enumerate(label_cols):
            np.random.seed(42)
            random.seed(42)

            print(f'Running {function_name}, label = {label}')

            # fresh copy so fitting never touches the shared template model
            model = deepcopy(models_dict[label])
            X_fit = X_train
            y_fit = y_train.iloc[:, k].values

            if function_name == 'SMOTE':
                sm = SMOTE(random_state=42)
                # fit_resample replaces fit_sample, which imbalanced-learn
                # has removed
                X_fit, y_fit = sm.fit_resample(X_fit, y_fit)

            model.fit(X_fit, y_fit)

            pred_all[:, k] = model.predict(X_test)
            # column 1 = probability of the positive class
            prob_all[:, k] = model.predict_proba(X_test)[:, 1]

        print(f'{function_name} \n')

        all_acc = {}
        all_acc['acc'] = 100*metrics.accuracy_score(y_test, pred_all)
        all_acc['micro'] = 100*metrics.f1_score(
            y_test, pred_all, average='micro')
        all_acc['macro'] = 100*metrics.f1_score(
            y_test, pred_all, average='macro')
        all_acc['hamming'] = 100*metrics.hamming_loss(y_test, pred_all)
        all_acc['r_loss'] = metrics.label_ranking_loss(y_test, prob_all)
        results.append(all_acc)

    df_final = pd.DataFrame(results)

    print(f'Metrics : \n{df_final.mean(axis=0)}')

    df_final.to_csv(f'../results/ATI/{function_name}.csv')


def run_BR_opt_th(
        X, y_all, label_cols,
        function_name, opt_thresholds=opt_thresholds,
        seeds=list(range(2)), split_size=0.2):

    """
    Runs BR with optimized thresholds and saves results in file

    A Binary Relevance logistic regression is trained and a label is
    predicted when its probability exceeds that label's own threshold.

    Inputs
        X : cleaned text of requests (TF-IDF vectorized inside)
        y_all : table of target labels (train and test)
        label_cols : list of labels' names
        function_name : name used in the log and for the results file
        opt_thresholds : mapping from label name to its optimized threshold
        seeds : seeds for the train/test split, one run per seed (optional)
        split_size : percentage of test data (optional, default = 0.2)

    """
    results = []
    # thresholds in label_cols order; they do not depend on the seed
    thresholds = np.array([opt_thresholds[label] for label in label_cols])

    for i, seed in enumerate(seeds):
        print(f'Iteration {i+1} of {len(seeds)}')
        print(f'Running {function_name}')

        model = BinaryRelevance(LogisticRegression(
            C=1, solver='lbfgs', max_iter=500))

        X_train, X_test, y_train, y_test = train_test_split(
            X, y_all[label_cols].values, test_size=split_size,
            random_state=seed)
        vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),
            tokenizer=tokenize,
            encoding='utf-8',
            use_idf=True,
            smooth_idf=True,
            max_features=None,
            norm='l2',
            max_df=0.5,
            min_df=0.01, strip_accents='unicode',
            sublinear_tf=1
        )

        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)
        model.fit(X_train, y_train)

        # densify once so thresholding and the metrics work on plain arrays
        pred_probs = model.predict_proba(X_test).A
        # each column is compared against its own label threshold
        y_pred = (pred_probs > thresholds)*1

        all_acc = {}
        all_acc['acc'] = 100*metrics.accuracy_score(y_test, y_pred)
        all_acc['micro'] = 100*metrics.f1_score(
            y_test, y_pred, average='micro')
        all_acc['macro'] = 100*metrics.f1_score(
            y_test, y_pred, average='macro')
        all_acc['hamming'] = 100*metrics.hamming_loss(y_test, y_pred)
        all_acc['r_loss'] = metrics.label_ranking_loss(y_test, pred_probs)
        results.append(all_acc)

    df_final = pd.DataFrame(results)
    print(f'Metrics : \n{df_final.mean(axis=0)}')
    df_final.to_csv(f'../results/ATI/{function_name}.csv')


def run_multilabel(
        X, y_all, label_cols,
        function_name, seeds=list(range(2)), split_size=0.2):

    """
    Runs Multilabel model according to function_name parameter
    Saves results in file

    Inputs
        X : cleaned text of requests (TF-IDF vectorized inside)
        y_all : table of target labels (train and test)
        label_cols : list of labels' names
        function_name : model to run, one of 'CC' (classifier chain),
                        'BR' (binary relevance) or 'MLKNN'; also used
                        for the results file
        seeds : seeds for the train/test split, one run per seed (optional)
        split_size : percentage of test data (optional, default = 0.2)

    Raises
        ValueError : if function_name is not one of the supported models

    """
    # factories, so that every seed gets a freshly constructed model
    model_builders = {
        'CC': lambda: ClassifierChain(LogisticRegression(
            C=1, solver='lbfgs', max_iter=500)),
        'BR': lambda: BinaryRelevance(LogisticRegression(
            C=1, solver='lbfgs', max_iter=500)),
        'MLKNN': lambda: MLkNN(k=2, s=0.5),
    }
    # fail early with a clear message instead of hitting an unbound model
    if function_name not in model_builders:
        raise ValueError(
            f'Unknown multilabel model {function_name!r}; '
            f'expected one of {list(model_builders)}')

    results = []
    for i, seed in enumerate(seeds):
        print(f'Iteration {i+1} of {len(seeds)}')
        print(f'Running {function_name}')

        model = model_builders[function_name]()

        X_train, X_test, y_train, y_test = train_test_split(
            X, y_all[label_cols].values, test_size=split_size,
            random_state=seed)

        vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),
            tokenizer=tokenize,
            encoding='utf-8',
            use_idf=True,
            smooth_idf=True,
            max_features=None,
            norm='l2',
            max_df=0.5,
            min_df=0.01, strip_accents='unicode',
            sublinear_tf=1
        )

        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)
        model.fit(X_train, y_train)
        # .A turns the model outputs into dense arrays for the metrics
        y_pred = model.predict(X_test).A
        pred_probs = model.predict_proba(X_test).A

        all_acc = {}
        all_acc['acc'] = 100*metrics.accuracy_score(y_test, y_pred)
        all_acc['micro'] = 100*metrics.f1_score(
            y_test, y_pred, average='micro')
        all_acc['macro'] = 100*metrics.f1_score(
            y_test, y_pred, average='macro')
        all_acc['hamming'] = 100*metrics.hamming_loss(y_test, y_pred)
        all_acc['r_loss'] = metrics.label_ranking_loss(y_test, pred_probs)
        results.append(all_acc)

    df_final = pd.DataFrame(results)
    print(f'Metrics : \n{df_final.mean(axis=0)}')
    df_final.to_csv(f'../results/ATI/{function_name}.csv')


def run_rakel(
        X, y_all, label_cols,
        function_name, label_partition,
        seeds=list(range(2)), split_size=0.2):
    """
    Runs RAKEL classifier and saves results in file

    Inputs
        X : cleaned text of requests (TF-IDF vectorized inside)
        y_all : table of target labels (train and test)
        label_cols : list of labels' names
        function_name : name used in the log and for the results file
                        (the file is named 'LP' when label_partition is 1)
        label_partition : number of times that the label set
                          will be divided; each labelset holds
                          number of labels // label_partition labels
        seeds : seeds for the train/test split, one run per seed (optional)
        split_size : percentage of test data (optional, default = 0.2)

    """
    # Only the 4-way partition uses a custom regularization strength; every
    # other partition keeps LogisticRegression's default C.
    base_params = {'max_iter': 1000}
    if label_partition == 4:
        base_params['C'] = 0.25

    results = []
    for i, seed in enumerate(seeds):
        print(f'Iteration {i+1} of {len(seeds)}')
        print(f'Running {function_name}')

        X_train, X_test, y_train, y_test = train_test_split(
            X, y_all[label_cols].values, test_size=split_size,
            random_state=seed)

        np.random.seed(42)
        random.seed(42)
        model = RakelD(
            base_classifier=LogisticRegression(**base_params),
            base_classifier_require_dense=[True, True],
            labelset_size=y_train.shape[1] // int(label_partition)
            )

        vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),
            tokenizer=tokenize,
            encoding='utf-8',
            use_idf=True,
            smooth_idf=True,
            max_features=None,
            norm='l2',
            max_df=0.5,
            min_df=0.01, strip_accents='unicode',
            sublinear_tf=1
        )

        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.transform(X_test)
        model.fit(X_train, y_train)
        # .A turns the model outputs into dense arrays for the metrics
        y_pred = model.predict(X_test).A
        pred_probs = model.predict_proba(X_test).A

        all_acc = {}
        all_acc['acc'] = 100*metrics.accuracy_score(y_test, y_pred)
        all_acc['micro'] = 100*metrics.f1_score(
            y_test, y_pred, average='micro')
        all_acc['macro'] = 100*metrics.f1_score(
            y_test, y_pred, average='macro')
        all_acc['hamming'] = 100*metrics.hamming_loss(y_test, y_pred)
        all_acc['r_loss'] = metrics.label_ranking_loss(y_test, pred_probs)
        results.append(all_acc)

    # a single partition covers the whole label set, saved under the name LP
    if label_partition == 1:
        function_name = 'LP'
    df_final = pd.DataFrame(results)
    print(f'Metrics : \n{df_final.mean(axis=0)}')
    df_final.to_csv(f'../results/ATI/{function_name}.csv')


def train_NN(
        X, y_all, label_cols, function_name='CNN', seeds=list(range(2)),
        split_size=0.2):

    """
    Runs Neural Network classifier and saves results in file

    The trained network of every seed is also saved to
    model_<function_name in lower case>_<seed>.h5 in the working directory.

    Inputs
        X : cleaned text of requests
        y_all : table of target labels (train and test)
        label_cols : list of labels' names
        function_name : architecture to train, 'CNN' or 'LSTM'; also used
                        for the results file
        seeds : seeds for the train/test split, one run per seed (optional)
        split_size : percentage of test data (optional, default = 0.2)

    Raises
        ValueError : if function_name is neither 'CNN' nor 'LSTM'

    """
    # fail early with a clear message instead of hitting an unbound model
    if function_name not in ('CNN', 'LSTM'):
        raise ValueError(
            f"function_name must be 'CNN' or 'LSTM', got {function_name!r}")

    vocab_size = 10000
    embedding_dim = 32
    max_length = 400
    trunc_type = 'post'
    padding_type = 'post'
    oov_tok = "<OOV>"
    num_epochs = 250

    all_results = []
    for i, seed in enumerate(seeds):
        print(f'Iteration {i+1} of {len(seeds)}')
        print(f'Running {function_name}')
        # training randomness is pinned; only the data split varies per seed
        set_seed(42)
        random.seed(42)
        np.random.seed(42)

        X_train, X_test, y_train, y_test = train_test_split(
                  X, y_all[label_cols].values, test_size=split_size,
                  random_state=seed)

        # the vocabulary is built from the training texts only
        tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
        tokenizer.fit_on_texts(X_train.values.tolist())

        # transforming the words into vectors
        X_train = tokenizer.texts_to_sequences(X_train)
        # forcing all requests to have the same length (padding)
        training_padded = pad_sequences(
            X_train, maxlen=max_length,
            padding=padding_type, truncating=trunc_type)
        X_test = tokenizer.texts_to_sequences(X_test)
        testing_padded = pad_sequences(
            X_test, maxlen=max_length,
            padding=padding_type, truncating=trunc_type)

        if function_name == 'CNN':
            # convolutional network
            model = tf.keras.Sequential([
                tf.keras.layers.Embedding(
                    vocab_size, embedding_dim, input_length=max_length),
                tf.keras.layers.Conv1D(32, 3, activation='relu'),
                tf.keras.layers.GlobalAveragePooling1D(),
                tf.keras.layers.Flatten(),
                tf.keras.layers.Dense(32, activation='relu'),
                tf.keras.layers.Dropout(0.2),
                tf.keras.layers.Dense(len(label_cols), activation='sigmoid')])
        else:
            # bidirectional LSTM
            model = tf.keras.Sequential([
                tf.keras.layers.Embedding(
                    vocab_size, embedding_dim, input_length=max_length),
                tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
                tf.keras.layers.Dense(32, activation='relu'),
                tf.keras.layers.Dropout(0.2),
                tf.keras.layers.Dense(len(label_cols), activation='sigmoid')])

        # one sigmoid output per label, trained with binary cross-entropy
        model.compile(
            loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.fit(training_padded, y_train, epochs=num_epochs,
                  validation_data=(testing_padded, y_test), verbose=0)
        preds = model.predict(testing_padded)

        # from probabilities to 1's or 0's
        labels_out = (preds > 0.5)*1

        all_acc = {}

        all_acc['acc'] = 100*metrics.accuracy_score(y_test, labels_out)
        all_acc['micro'] = 100*metrics.f1_score(
            y_test, labels_out, average='micro')
        all_acc['macro'] = 100*metrics.f1_score(
            y_test, labels_out, average='macro')
        all_acc['hamming'] = 100*metrics.hamming_loss(y_test, labels_out)
        all_acc['r_loss'] = metrics.label_ranking_loss(y_test, preds)
        all_results.append(all_acc)

        # the file name carries the architecture, so CNN and LSTM runs do
        # not overwrite each other
        model.save(f"model_{function_name.lower()}_{seed}.h5")

    df_final = pd.DataFrame(all_results)
    print(f'Metrics : \n{df_final.mean(axis=0)}')
    df_final.to_csv(f'../results/ATI/{function_name}.csv')


def run_all(X, y_all, label_cols, seeds=list(range(10))):
    """
    Runs all models described in the paper

    Every model is run with the same seeds and writes its own results file.

    Inputs
        X : cleaned text of requests
        y_all : table of target labels (train and test)
        label_cols : list of labels' names
        seeds : seeds forwarded to every model (optional)

    """

    print('Running all models \n')

    for function_name in ('optimized_models', 'SMOTE'):
        run_BR_opt_models(
            X, y_all, models_dict, label_cols, function_name, seeds=seeds)

    # seeds are forwarded here too, so this model is evaluated on the same
    # splits as all the others
    run_BR_opt_th(X, y_all, label_cols, 'optimized_thresholds', seeds=seeds)

    for function_name in ('CC', 'BR', 'MLKNN'):
        run_multilabel(X, y_all, label_cols, function_name, seeds=seeds)

    # one name per partition: a shared name would make the runs overwrite
    # each other's results file
    for function_name, label_partition in (
            ('rakel_4', 4), ('rakel_2', 2), ('LP', 1)):
        run_rakel(
            X, y_all, label_cols, function_name, label_partition,
            seeds=seeds)

    run_ECC(X, y_all, label_cols, 'ECC', seeds=seeds)

    for function_name in ('CNN', 'LSTM'):
        train_NN(X, y_all, label_cols, function_name, seeds=seeds)


def main(argv, seeds=list(range(10))):
    """
    Runs the method named on the command line

    The method name is checked before any data is read; when it is missing
    or unknown, the list of valid methods is printed and nothing is run.

    Inputs
        argv : command-line arguments; argv[1] is the method to run
        seeds : seeds forwarded to the selected method (optional)

    """
    function_name = argv[1] if len(argv) > 1 else None

    def br_models(X, y_all, label_cols):
        run_BR_opt_models(
            X, y_all, models_dict, label_cols, function_name, seeds=seeds)

    def multilabel(X, y_all, label_cols):
        run_multilabel(X, y_all, label_cols, function_name, seeds=seeds)

    def rakel(label_partition):
        return lambda X, y_all, label_cols: run_rakel(
            X, y_all, label_cols, function_name, label_partition,
            seeds=seeds)

    def neural(X, y_all, label_cols):
        train_NN(X, y_all, label_cols, function_name, seeds=seeds)

    # method name -> callable that runs it on (X, y_all, label_cols)
    runners = {
        'all': lambda X, y_all, label_cols: run_all(
            X, y_all, label_cols, seeds=seeds),
        'optimized_thresholds': lambda X, y_all, label_cols: run_BR_opt_th(
            X, y_all, label_cols, function_name, seeds=seeds),
        'optimized_models': br_models,
        'SMOTE': br_models,
        'CC': multilabel,
        'BR': multilabel,
        'MLKNN': multilabel,
        'rakel_4': rakel(4),
        'rakel_2': rakel(2),
        'LP': rakel(1),
        'ECC': lambda X, y_all, label_cols: run_ECC(
            X, y_all, label_cols, function_name, seeds=seeds),
        'LSTM': neural,
        'CNN': neural,
    }

    if function_name not in runners:
        # the list is built from the table, so it always matches what runs
        print('Please enter one of the following methods:\n  '
              + ', '.join(runners))
        return

    X = pd.read_csv('../data_clean/text.csv').Clean_Text
    y_all = pd.read_csv('../data_clean/labels.csv')
    label_cols = list(models_dict)

    print(f'Running classifers for labels: \n{ label_cols}')
    runners[function_name](X, y_all, label_cols)

if __name__ == "__main__":
    # Echo the requested method only when one was given, so that a bare
    # invocation is not stopped here by an IndexError.
    if len(sys.argv) > 1:
        print(f'Running: {sys.argv[1]}')
    main(sys.argv)
